import time
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction import text
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
# Spacy essentials
nlp_en = spacy.load("en_core_web_sm")
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
def compute_time_difference(time_start, time_end):
    """
    Compute the elapsed time between two timestamps as a human-readable string.

    @param time_start: Start time (seconds, e.g. from time.time())
    @param time_end: End time (seconds)
    @return: Elapsed time formatted as "<x> seconds" or, when more than a
             minute has passed, "<x> minutes" (rounded to 4 decimal places)
    """
    elapsed = time_end - time_start
    # Report in minutes once we are past the one-minute mark.
    if elapsed > 60:
        return str(round(elapsed / 60, 4)) + " minutes"
    return str(round(elapsed, 4)) + " seconds"
# Total Notebook Run: Start Time
total_time_start = time.time()

# Load the complete India news-headlines dataset.
news_df = pd.read_csv("data/india-news-headlines.csv")

# NOTE(review): the original left bare expressions (e.g. `news_df.shape`)
# after each print label — those only render in a notebook cell and are
# no-ops in a plain script, so they are wrapped in print() here.
print("Data dimensions:")
print(news_df.shape)
print("Data types:")
print(news_df.dtypes)
print("First few entries:")
print(news_df.head())
print("Last few entries:")
print(news_df.tail())
print("Columnar statistics:")
print(news_df.describe(include="all"))
print("Checking for null values:")
print(news_df.isnull().sum())

# Per-category headline counts; value_counts() is already sorted by
# descending count, and sort_index on the fresh RangeIndex preserves that.
news_cat_values = news_df.headline_category.value_counts().reset_index().values
news_cat_df = pd.DataFrame(news_cat_values, columns=["headline_category", "count"])
news_cat_df.sort_index(ascending=True, inplace=True)
print("Data dimensions:")
print(news_cat_df.shape)

# Bar chart of the 50 most frequent headline categories.
print("First 50 entries:")
news_cat_top_df = news_cat_df.head(50)
fig = px.bar(news_cat_top_df, x="headline_category", y="count")
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Headline category histogram: Top 50")
fig.show()
# NOTE: We will be filtering out data for the city.bengaluru category.
# Keep only headlines belonging to the city.bengaluru category.
bng_news_df = news_df.loc[news_df["headline_category"]=="city.bengaluru"]
# NOTE(review): bare `.shape` / `.head()` expressions only display in a
# notebook; wrapped in print() so the output also appears when run as a script.
print("Data dimensions:")
print(bng_news_df.shape)
print("First few entries:")
print(bng_news_df.head())
# Getting all headline text into one lowercase string
bng_all_text = ", ".join([x.lower() for x in bng_news_df["headline_text"]])
# Generating WordCloud; spaCy's English stop-word list filters common words
wordcloud = WordCloud(width = 800, height = 800,
                background_color ="white",
                stopwords = spacy_stopwords,
                min_font_size = 10).generate(bng_all_text)
# Plotting the WordCloud image
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
def get_top_ngrams(bow, max_features, ngrams, top_k):
    '''
    Get top n-grams for the given corpora.

    @param bow: List/Bag of Words (one document string per element)
    @param max_features: Maximum number of features to keep
    @param ngrams: Number of grams
                   (1=> unigram, 2=>bigram, 3=>trigram)
    @param top_k: Number of top n-grams to return
    @return: pd.Series of the top_k n-grams indexed by n-gram, sorted by
             descending corpus-wide count
    '''
    # CountVectorizer counts raw term frequencies (not tf-idf), so name it
    # accordingly. The original passed `input=bow`, but `input` selects HOW
    # documents are supplied ('content'/'filename'/'file') — the documents
    # themselves go to fit_transform — and modern scikit-learn rejects an
    # invalid `input` value outright.
    vectorizer = text.CountVectorizer(ngram_range=(ngrams, ngrams),
                                      max_features=max_features,
                                      stop_words="english")
    matrix = vectorizer.fit_transform(bow)
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement.
    features = vectorizer.get_feature_names_out()
    # Sum counts over all documents (axis=0) to get corpus-wide frequencies.
    ngram_counts = pd.Series(np.asarray(matrix.sum(axis=0)).ravel(),
                             index=features)
    return ngram_counts.sort_values(ascending=False).head(top_k)
# Bag of Words from bng_news_df
bng_news_text = bng_news_df["headline_text"].tolist()

# Top-20 bigrams. NOTE(review): bare `.shape` / `.head()` expressions only
# render in a notebook; wrapped in print() so a plain script shows them too.
top_k_bng_bigrams = get_top_ngrams(bow=bng_news_text, max_features=5000, ngrams=2, top_k=20)
top_k_bng_bigrams_df = pd.DataFrame(top_k_bng_bigrams, columns=["count"]).reset_index().rename(columns={"index": "bigram"})
print("Data dimensions:")
print(top_k_bng_bigrams_df.shape)
print("First few entries:")
print(top_k_bng_bigrams_df.head())
fig = px.bar(top_k_bng_bigrams_df, x="bigram", y="count")
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Top 20 Bigrams")
fig.show()

# Top-20 trigrams, same treatment.
top_k_bng_trigrams = get_top_ngrams(bow=bng_news_text, max_features=5000, ngrams=3, top_k=20)
top_k_bng_trigrams_df = pd.DataFrame(top_k_bng_trigrams, columns=["count"]).reset_index().rename(columns={"index": "trigram"})
print("Data dimensions:")
print(top_k_bng_trigrams_df.shape)
print("First few entries:")
print(top_k_bng_trigrams_df.head())
fig = px.bar(top_k_bng_trigrams_df, x="trigram", y="count")
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Top 20 Trigrams")
fig.show()
time_start = time.time()
# Collect the lowercase surface form of every noun and verb token in the
# Bengaluru headlines.
noun_bng = []
verb_bng = []
# NOTE(review): the original passed n_threads=16, which was deprecated in
# spaCy v2.1 and removed in v3 (where it raises a TypeError); batch_size is
# still supported. The original also wrapped the token loop in a bare
# `except:` that appended meaningless empty strings on any error — token
# iteration over a parsed Doc does not raise in normal operation, so the
# silent swallow is dropped.
for doc in nlp_en.pipe(bng_news_text, batch_size=1000):
    for token in doc:
        if token.pos_ == "NOUN":
            noun_bng.append(token.text.lower())
        elif token.pos_ == "VERB":
            verb_bng.append(token.text.lower())
time_end = time.time()
print("Total time taken to obtain all nouns and verbs: "+compute_time_difference(time_start, time_end))
# Frequency table of all nouns found in the Bengaluru headlines;
# value_counts() is already sorted by descending count, and sort_index on
# the fresh RangeIndex preserves that order.
nouns_df = pd.DataFrame(noun_bng,columns=["noun"])
nouns_df_values = nouns_df.noun.value_counts().reset_index().values
nouns_cnt_df = pd.DataFrame(nouns_df_values, columns=["noun", "count"])
nouns_cnt_df.sort_index(ascending=True, inplace=True)
# NOTE(review): bare `.shape` / `.head()` expressions only render in a
# notebook; wrapped in print() so a plain script shows them too.
print("Data dimensions:")
print(nouns_cnt_df.shape)
print("First few entries:")
print(nouns_cnt_df.head())
# Bar chart of the 20 most frequent nouns.
top20_nouns_cnt_df = nouns_cnt_df.head(20)
fig = px.bar(top20_nouns_cnt_df, x="noun", y="count")
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Top 20 Nouns")
fig.show()
# Frequency table of all verbs found in the Bengaluru headlines; mirrors
# the noun analysis above.
verbs_df = pd.DataFrame(verb_bng,columns=["verb"])
verbs_df_values = verbs_df.verb.value_counts().reset_index().values
verbs_cnt_df = pd.DataFrame(verbs_df_values, columns=["verb", "count"])
verbs_cnt_df.sort_index(ascending=True, inplace=True)
# NOTE(review): bare `.shape` / `.head()` expressions only render in a
# notebook; wrapped in print() so a plain script shows them too.
print("Data dimensions:")
print(verbs_cnt_df.shape)
print("First few entries:")
print(verbs_cnt_df.head())
# Bar chart of the 20 most frequent verbs.
top20_verbs_cnt_df = verbs_cnt_df.head(20)
fig = px.bar(top20_verbs_cnt_df, x="verb", y="count")
fig.update_xaxes(tickangle=45)
fig.update_layout(title="Top 20 verbs")
fig.show()
# Total Notebook Run: End Time — report the overall wall-clock duration.
total_time_end = time.time()
elapsed_msg = compute_time_difference(total_time_start, total_time_end)
print("Total time taken to run the entire notebook: " + elapsed_msg)